##Functions
#Plot functions
plotTheme <- theme(
  plot.title =element_text(size=12),
  plot.subtitle = element_text(size=8),
  plot.caption = element_text(size = 6),
  axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
  axis.text.y = element_text(size = 10),
  axis.title.y = element_text(size = 10),
  # Set the entire chart region to blank
  panel.background=element_blank(),
  plot.background=element_blank(),
  #panel.border=element_rect(colour="#F0F0F0"),
  # Format the grid
  panel.grid.major=element_line(colour="#D0D0D0",size=.2),
  axis.ticks=element_blank())

mapTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = 16,colour = "black"),
    plot.subtitle=element_text(face="italic"),
    plot.caption=element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=2),
    strip.text.x = element_text(size = 14))
}


New York City Bike Infrastructure & Cycling Patterns

Use Case

Our client is the New York City Department of Transportation. They have shared two datasets with us - LION: street centerlines over time - Bike Trips: linestrings of select trips taken using NYC’s bikeshare system 2018-2021

Our goal is to use these two datasets to analyze the impact of bicycle infrastructure changes on the routes chosen by cyclists over time. We will analyze bike ridership on streets that have had new bike lanes added during our study period (2018-2020) before and after the infrastructure investment. We plan to use a difference in differences approach to calculate the estimate the impact of a particular bike lane on changing ridership patterns.

Bike Lane Analysis

Types of Bike Lanes

There are three main categories of bike lane in New York City’s LION data: Protected Lanes, Unprotected Lanes, and Sharrows. A fourth category, Combination, is comprised of street segments with a combination of the other three bike lane categories.

Fig1.Bike Lane Types in New York City

Fig1.Bike Lane Types in New York City


Unprotected bike lanes are the most common type found in New York City, while the borough of the Bronx has the highest bike lane mileage of any borough.

Fig2.Bike Lanes by Type

Fig2.Bike Lanes by Type

Fig3.Bike Lanes by Borough

Fig3.Bike Lanes by Borough


From glancing at a map of these lanes across the city, we get a sense that the lane types are not evenly distributed throughout the 5 boroughs of New York City.

#import bike lane data
bikelane_17d<- st_read("E:/NYCDOT/Feb23/data/bikelane_17d/bikelane_17d.shp")
bikelane_20d<- st_read("E:/NYCDOT/Feb23/data/bikelane_17d/bikelane_20d.shp")


new_18 <- st_read("E:/NYCDOT/Feb23/data/bikelane_1718Diff_2/bikelane_1718Diff_2.shp")
new_19 <- st_read("E:/NYCDOT/Feb23/data/bikelane_1819Diff_2/bikelane_1819Diff_2.shp")
new_20 <- st_read("E:/NYCDOT/Feb23/data/bikelane_1920Diff_2/bikelane_1920Diff_2.shp")


boroughs<- st_read("E:/NYCDOT/Feb23/data/borough.geojson")
ntas<- st_read("E:/NYCDOT/Feb23/data/ntas.geojson")

parks<- st_read("E:/NYCDOT/Feb23/data/parks_nyc/parks_nyc.shp")
big_parks<-subset(parks, parks$acres>30)

#get rid of excess columns
new_18<- new_18%>%
  select(120:242)

new_19<- new_19%>%
  select(120:241)

new_20<- new_20%>%
  select(120:244)

# add lane category
new_18$LaneType <- '3: Combination'
new_18$LaneType[new_18$BikeLane_1=='1']<- '1: Protected Lane'
new_18$LaneType[new_18$BikeLane_1=='2']<- '2: Unprotected Lane'
new_18$LaneType[new_18$BikeLane_1=='3']<- '4: Sharrow'

new_19$LaneType <- '3: Combination'
new_19$LaneType[new_19$BikeLane_1=='1']<- '1: Protected Lane'
new_19$LaneType[new_19$BikeLane_1=='2']<- '2: Unprotected Lane'
new_19$LaneType[new_19$BikeLane_1=='3']<- '4: Sharrow'

new_20$LaneType <- '3: Combination'
new_20$LaneType[new_20$BikeLane_1=='1']<- '1: Protected Lane'
new_20$LaneType[new_20$BikeLane_1=='2']<- '2: Unprotected Lane'
new_20$LaneType[new_20$BikeLane_1=='3']<- '4: Sharrow'

bikelane_17d$LaneType <- '3: Combination'
bikelane_17d$LaneType[bikelane_17d$BikeLane=='1']<- '1: Protected Lane'
bikelane_17d$LaneType[bikelane_17d$BikeLane=='2']<- '2: Unprotected Lane'
bikelane_17d$LaneType[bikelane_17d$BikeLane=='3']<- '4: Sharrow'


bikelane_20d$LaneType <- '3: Combination'
bikelane_20d$LaneType[bikelane_20d$BikeLane=='1']<- '1: Protected Lane'
bikelane_20d$LaneType[bikelane_20d$BikeLane=='2']<- '2: Unprotected Lane'
bikelane_20d$LaneType[bikelane_20d$BikeLane=='3']<- '4: Sharrow'


#add boro name
bikelane_17d$Borough <- 'Manhattan'
bikelane_17d$Borough[bikelane_17d$LBoro=='2']<- 'Bronx'
bikelane_17d$Borough[bikelane_17d$LBoro=='3']<- 'Brooklyn'
bikelane_17d$Borough[bikelane_17d$LBoro=='4']<- 'Queens'
bikelane_17d$Borough[bikelane_17d$LBoro=='5']<- 'Staten Island'

bikelane_20d$Borough <- 'Manhattan'
bikelane_20d$Borough[bikelane_20d$LBoro=='2']<- 'Bronx'
bikelane_20d$Borough[bikelane_20d$LBoro=='3']<- 'Brooklyn'
bikelane_20d$Borough[bikelane_20d$LBoro=='4']<- 'Queens'
bikelane_20d$Borough[bikelane_20d$LBoro=='5']<- 'Staten Island'

new_18$Borough <- 'Manhattan'
new_18$Borough[new_18$LBoro_1=='2']<- 'Bronx'
new_18$Borough[new_18$LBoro_1=='3']<- 'Brooklyn'
new_18$Borough[new_18$LBoro_1=='4']<- 'Queens'
new_18$Borough[new_18$LBoro_1=='5']<- 'Staten Island'

new_19$Borough <- 'Manhattan'
new_19$Borough[new_19$LBoro_1=='2']<- 'Bronx'
new_19$Borough[new_19$LBoro_1=='3']<- 'Brooklyn'
new_19$Borough[new_19$LBoro_1=='4']<- 'Queens'
new_19$Borough[new_19$LBoro_1=='5']<- 'Staten Island'

new_20$Borough <- 'Manhattan'
new_20$Borough[new_20$LBoro_1=='2']<- 'Bronx'
new_20$Borough[new_20$LBoro_1=='3']<- 'Brooklyn'
new_20$Borough[new_20$LBoro_1=='4']<- 'Queens'
new_20$Borough[new_20$LBoro_1=='5']<- 'Staten Island'

#if sum of road <1000 ft, drop all segments with that road name
new_18_clean<- new_18%>%
  group_by(Street_1, LaneType)%>%
  summarize(Length=sum(SHAPE_Le_1),
            Borough=first(Borough))%>%
  filter(Length>1000)

new_19_clean<- new_19%>%
  group_by(Street_1, LaneType)%>%
  summarize(Length=sum(SHAPE_Le_1),
            Borough=first(Borough))%>%
  filter(Length>1000)

new_20_clean<- new_20%>%
  group_by(Street_1, LaneType)%>%
  summarize(Length=sum(SHAPE_Le_1),
            Borough=first(Borough))%>%
  filter(Length>1000)

#combine all new lanes into one dataframe (not filtered by length)
names(bikelane_17d)[names(bikelane_17d) == "SHAPE_Leng"] <- "Length"
names(new_18)[names(new_18) == "SHAPE_Le_1"] <- "Length"
names(new_19)[names(new_19) == "SHAPE_Le_1"] <- "Length"
names(new_20)[names(new_20) == "SHAPE_Le_1"] <- "Length"

names(new_18)[names(new_18) == "Street_1"] <- "Street"
names(new_19)[names(new_19) == "Street_1"] <- "Street"
names(new_20)[names(new_20) == "Street_1"] <- "Street"

names(new_18)[names(new_18) == "SegmentID_"] <- "SegmentID"
names(new_19)[names(new_19) == "SegmentID_"] <- "SegmentID"
names(new_20)[names(new_20) == "SegmentID_"] <- "SegmentID"

bikelane_17d$Year<- 2017
new_18$Year<- 2018
new_19$Year<- 2019
new_20$Year<- 2020

lane_17d_st<-bikelane_17d%>%
  as.data.frame()%>%
  select(Street, SegmentID, LaneType, Length, Borough, Year)

new_18_st<-new_18%>%
  as.data.frame()%>%
  select(Street, SegmentID, LaneType, Length, Borough, Year)

new_19_st<-new_19%>%
  as.data.frame()%>%
  select(Street, SegmentID, LaneType, Length, Borough, Year)

new_20_st<-new_20%>%
  as.data.frame()%>%
  select(Street, SegmentID, LaneType, Length, Borough, Year)


combo<- rbind(lane_17d_st, new_18_st, new_19_st, new_20_st)
combo_sf<-inner_join(bikelane_20d, combo, by='SegmentID')

combo_2<- combo%>%
  count(Year=factor(Year))%>%
  mutate(pct=prop.table(n))


Indeed, Manhattan hosts the largest share of protected bike lanes, while in Brooklyn, Staten Island, and the Bronx, unprotected lanes are the most common.

Fig5.Bike Lanes by Borough and Type

Fig5.Bike Lanes by Borough and Type


Identifying New Bike Lanes


Our trip data begins in January 2018, so it is important to identify the bike lanes that have been added to the network since 2017.

Data Cleaning


To find the difference between bike lane files across time, we used the Symmetrical Differences tool in ArcGIS. Though this provided more accurate results than the method done in R, there was still an issue. Some stretches of road were represented geographically differntly (eg with 3 segments instead of 4), which resulted in many small stretches of street that were erroneously identified as having new bike lanes. To remedy this, we filtered out streets with less than 1000 ft of additional bike lane.

New Lanes


Only 15% of the current bike lane mileage in New York City was added during the study period. The majority of this was added during 2020, which may pose a challenge to generalizability (due to changing ridership patterns during COVID-19) or the volume of trip data available after the infrastructure investment.

Looking at the map for each year more closely reveals that 2018 and 2019 saw a limited number of corridors where significant stretches of new bike lane were added, particularly when one excludes sharrows, which offer minimal protection to cyclists.


Citibike Extent


Another complicating factor is which bike lane additions fall within the citibike network during the study period. We are still waiting to get data on station locations over time, but at least in the present day, the citibike network is still fairly limited. Many parts of Brooklyn, Queens, and the Bronx are not served, and there are no stations in Staten Island.

Fig11.NYC Citibike Network Extent 2021

Fig11.NYC Citibike Network Extent 2021


Picking Some Streets


The tables below show the streets in each year with the longest bike lane additions. Staten Island is filtered out entirely since there are no Citibike stations there.

Table1.Longest New Bike Lanes in 2018
Street_1 Borough LaneType Length
PARSONS BOULEVARD Queens 2: Unprotected Lane 14908.006
QUEENS BOULEVARD Queens 1: Protected Lane 13355.781
MARINE PY-GIL HODGES BR BIKE PTH Brooklyn 3: Combination 7590.816
EAST 111 STREET Manhattan 2: Unprotected Lane 6168.733
7 AVENUE Manhattan 1: Protected Lane 5710.721
BIKE PATH Bronx 1: Protected Lane 5689.386
SHORE ROAD Brooklyn 2: Unprotected Lane 5455.624
150 STREET Queens 2: Unprotected Lane 5287.694
EAST 128 STREET Manhattan 2: Unprotected Lane 5209.892
BOARDWALK Queens 1: Protected Lane 4975.210
2 AVENUE Manhattan 1: Protected Lane 4244.052
EAST 225 STREET Bronx 2: Unprotected Lane 4215.968
EAST 226 STREET Bronx 2: Unprotected Lane 4100.745
NORTHERN BOULEVARD Queens 1: Protected Lane 4024.241
EAST 216 STREET Bronx 2: Unprotected Lane 3821.516
Table2.Longest New Bike Lanes in 2019
Street_1 Borough LaneType Length
SHORE PARKWAY GREENWAY Brooklyn 1: Protected Lane 8061.755
BOBBY WAGNER WALK Manhattan 1: Protected Lane 6469.601
CITY ISLAND ROAD GREENWAY Bronx 1: Protected Lane 4959.806
JOHN FINLEY WALK Manhattan 1: Protected Lane 3717.356
BROOKLYN WATERFRONT GREENWAY Brooklyn 1: Protected Lane 2872.268
EAST DRIVE Brooklyn 1: Protected Lane 2415.813
SHORE PARKWAY GREENWAY Brooklyn 3: Combination 2347.252
EAST RIVER ESPLANADE Manhattan 1: Protected Lane 2186.942
CONCRETE PLANT PARK GREENWAY Bronx 1: Protected Lane 1976.764
BOARDWALK WEST Brooklyn 1: Protected Lane 1717.649
BUSH TERMINAL GREENWAY Brooklyn 1: Protected Lane 1710.332
BARETTO POINT PARK GREENWAY Bronx 1: Protected Lane 1691.917
ALLEY PARK POND GREENWAY Queens 1: Protected Lane 1688.812
PLAY WAY Manhattan 1: Protected Lane 1651.468
FLATBUSH AVENUE GREENWAY Brooklyn 1: Protected Lane 1642.187
UNIVERSITY AVENUE Bronx 4: Sharrow 1629.931
TIFFANY STREET Bronx 3: Combination 1394.401
EDGECOMBE AVENUE Manhattan 4: Sharrow 1310.510
OUTLOOK WAY Manhattan 1: Protected Lane 1236.022
FDR DRIVE GREENWAY Manhattan 1: Protected Lane 1168.713
Table3.Longest New Bike Lanes in 2020
Street_1 Borough LaneType Length
AMSTERDAM AVENUE Manhattan 2: Unprotected Lane 17662.930
ORCHARD BEACH BOARDWALK Bronx 1: Protected Lane 13901.675
FOUNTAIN AVENUE Brooklyn 1: Protected Lane 11231.848
WILLETS POINT BOULEVARD Queens 2: Unprotected Lane 10584.591
SANFORD AVENUE Queens 2: Unprotected Lane 10319.952
BROADWAY Bronx 2: Unprotected Lane 9658.692
3 AVENUE Bronx 2: Unprotected Lane 9533.877
EAST 12 STREET Manhattan 1: Protected Lane 8514.923
CLERMONT AVENUE Brooklyn 2: Unprotected Lane 8338.833
SEAVIEW AVENUE Staten Island 2: Unprotected Lane 8287.891
WEST 52 STREET Manhattan 1: Protected Lane 7786.272
STARR STREET Brooklyn 2: Unprotected Lane 7553.790
MORRIS PARK AVENUE Bronx 2: Unprotected Lane 7050.729
81 STREET Queens 2: Unprotected Lane 6946.115
HARMAN STREET Brooklyn 2: Unprotected Lane 6797.925
EAST 3 STREET Manhattan 2: Unprotected Lane 6772.666
EAST 4 STREET Manhattan 2: Unprotected Lane 6769.554
WEST 26 STREET Manhattan 1: Protected Lane 6718.947
HIMROD STREET Brooklyn 2: Unprotected Lane 6705.151
TROUTMAN STREET Brooklyn 2: Unprotected Lane 6646.425

In 2018, the streets with the longest additions of bike lane that also fall within the Citibike network were all in Manhattan: 7th Avenue, E 111th St, and E 128th St. We have mostly analyzed 2018 and 2019 trip data, so we choose to focus on two neighborhoods for our exploratory analysis of trips: East Harlem and Chelsea.

Trip Data Analysis

Data Cleaning


There are several steps that we have used to clean our trip data, including both cleaning data fall outside of New York City, and wrong trajectories inside of city.

Clip Trip Data


There are two types of data that are fall outside of New York City. We pick trips in March5th to demonstrate how these data look like. The first type is that there are some data located in Montreal, like the following image shows.
Fig12.Wrong trajectories falling outisde of NYC-1

Fig12.Wrong trajectories falling outisde of NYC-1


Another type of wrong trajectories outside of NYC are trips that are connected to another country, like the following image:
Fig13.Wrong trajectories falling outisde of NYC-2

Fig13.Wrong trajectories falling outisde of NYC-2


In order to clean up these data, we use both clip and spatial intersection. To illustrate, the first type of data could be filtered out by clipping the trip data which are inside NYC boundary. However, for the second type of data, we find out the intersected points of trajectories with the NYC boundary, and then filter out those trips with their unique id. Due to the size of data, we import data which are already cleaned up in the later part, and provide the codes that we used to clean data.

#sample code that we used to clean up data
##Get NYC data
#Reference: http://gis.ny.gov/gisdata/inventories/details.cfm?DSID=927
#NYC <- st_read("E:/NYCDOT/NYS_Civil_Boundaries_SHP/Cities_Towns.shp") %>%
  #st_transform(st_crs(March)) %>%
  #filter(NAME == "New York")

#NYC_line <- st_cast(NYC, "MULTILINESTRING")

#Marchintersect <- March[NYC_line,]

#March18_cleaned <- March[NYC, ] %>%
  #filter(! id %in% Marchintersect$id)

There are also trips inside NYC that have strange trajectories. We plan to filter out these data by using several steps.
  1. Buffering data of bike lanes.
  2. Transfer geometries of trips data from linestring to points.
  3. Filter out trip data which overlay with the buffered bike lanes.
  4. Gorup the points of trip data which are in the same route and have same id, which is identical to each trip.


15 feet buffer


Due to the size of data and hope to keep associated attribute table, we buffer the road data in ArcGIS. During the process of buffering, there is also trade-off:
Fig14.Buffer Choice

Fig14.Buffer Choice


Later, we will use the buffer of bike lanes to find out overlapped trip points. However, as the figure shows, when the buffer becomes wider, there are more overlaps between bike lanes. Sometimes, the trajectories of trips do not fully overlap with the road because the road buffer is not wide enough to cover all the trips. In other words, choosing thinner buffer has potential to loss more data, but choosing thicker buffer brings problems of overlapping roads. In the consideration of this trade-off, we choose to use 15-feet buffer for bike lanes.

Line to Point


Similarly, due to the size of dataset and we would like to keep associated attributes of data, we use Geoprocessing Tool, Generate Pointes Along Lines, in ArcGIS to transfer our geometries from linestring to points. In order to keep the progress consistent, we make the Point Placement by 150 meters. In other words, for one trajectory of bike trip, there will be a point by each 150-meter, including the end points.

Trips on road


Then we use spatial intersect to allocate Street Name on trip points. If the trip points are intersect with buffered lanes, they will be assigned to a Street Name. But there is condition that one trip point will locate on several roads, at the condition when the roads are overlapped with each other at nodes.
Fig15.Point on Several Roads

Fig15.Point on Several Roads


Like the point shown in the figure. It is located at the node where LEXINGTON AVENUE LINE, EAST 77 STREET, and LENOX HILL HOSPITAL are overlapped. However, the spatial intersect within st_join function only assign one road name on that point. This process is random but is acceptable because it does not assign multiple Street Name on the point.

Exploratory Analysis

Yearly Trend


We would like to at first have a sense about how the trip data vary over year.

March2018 <- st_read("E:/NYCDOT/rideData_cleaned/March2018/March18_cleaned.shp")
Apr2018 <- st_read("E:/NYCDOT/rideData_cleaned/Apr2018/Apr18_cleaned.shp")
May2018 <- st_read("E:/NYCDOT/rideData_cleaned/May2018/May18_cleaned.shp")
Jun2018 <- st_read("E:/NYCDOT/rideData_cleaned/Jun2018/June18_cleaned.shp")
July2018 <- st_read("E:/NYCDOT/rideData_cleaned/July2018/July18_cleaned.shp")
Aug2018 <- st_read("E:/NYCDOT/rideData_cleaned/Aug2018/Aug18_cleaned.shp")
Sep2018 <- st_read("E:/NYCDOT/rideData_cleaned/Sep2018/Sep18_cleaned.shp")
Oct2018 <- st_read("E:/NYCDOT/rideData_cleaned/Oct2018/Oct18_cleaned.shp")
Nov2018 <- st_read("E:/NYCDOT/rideData_cleaned/Nov2018/Nov18_cleaned.shp")
Dec2018 <- st_read("E:/NYCDOT/rideData_cleaned/Dec2018/Dec18_cleaned.shp")


1.Daily Trend
dayTrend2018 <- rbind(March2018 %>% 
        as.data.frame() %>% 
        group_by(day) %>%
        tally() %>%
        mutate(month = 3), 
      Apr2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 4),
      May2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 5),
      Jun2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 6),
      July2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 7),
      Aug2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 8),
      Sep2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 9),
      Oct2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 10),
      Nov2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 11),
      Dec2018 %>%
        as.data.frame() %>%
        group_by(day) %>%
        tally() %>%
        mutate(month = 12))
ggplot(dayTrend2018, aes(x=day, y=n, color=as.character(month))) + 
  geom_line() +
  labs(title = "Fig16.Daily Trend in year 2018",
       x = "Day",
       y = "Count",
       colour="Month") +
  plotTheme +
  theme(plot.title = element_text(size=14),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12))


About the Daily Trend in 2018, there is no clear and general trend.

2. Hourly Trend
hourTrend2018 <- rbind(March2018 %>% 
        as.data.frame() %>% 
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 3), 
      Apr2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 4),
      May2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 5),
      Jun2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 6),
      July2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 7),
      Aug2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 8),
      Sep2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 9),
      Oct2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 10),
      Nov2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 11),
      Dec2018 %>%
        as.data.frame() %>%
        group_by(rhour) %>%
        tally() %>%
        mutate(month = 12))
ggplot(hourTrend2018, aes(x=rhour, y=n, color=as.character(month))) + 
  geom_line() +
  labs(title = "Fig17.Hourly Trend in year 2018",
       x = "Hour",
       y = "Count",
       colour="Month") +
  plotTheme +
  theme(plot.title = element_text(size=14),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12))


From the Hourly Trend graph in 2018, the bike trips peak at noon and at late night.

3.Weekday Trend
wdayTrend2018 <- rbind(March2018 %>% 
        as.data.frame() %>% 
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 3), 
      Apr2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 4),
      May2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 5),
      Jun2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 6),
      July2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 7),
      Aug2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 8),
      Sep2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 9),
      Oct2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 10),
      Nov2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 11),
      Dec2018 %>%
        as.data.frame() %>%
        mutate(wday = wday(time)) %>%
        group_by(wday) %>%
        tally() %>%
        mutate(month = 12))
ggplot(wdayTrend2018, aes(x=wday, y=n, color=as.character(month))) + 
  geom_line() +
  scale_x_continuous(breaks = seq(0, 7, by = 1)) +
  labs(title = "Fig18.Weekday Trend in year 2018",
       x = "Day of Week",
       y = "Count",
       colour="Month") +
  plotTheme +
  theme(plot.title = element_text(size=14),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12))


If we view the data from the day of week, there is no clear trend about the data. Overall, the number of trips peak at noon and at night, and the later months tend to have more trips than earlier months. There is no clear trend how the data vary among days or days of week.

Borough Level Analysis


Because Manhanttan as our major study hour, we also visualize the density of trips on each roads in Manhanttan, in July 2018 and July 2019.
Due to the size of the dataset, we only choose July 22 pm as our analyzing time.Then, we narrow our data into Manhattan. Because the redundacy of process, the following part just show how we did the process but will not run the code. Later, the processed data will be directly imported.

#import borough Manhanttan
borough <- st_read("E:/NYCDOT/Borough Boundaries/geo_export_4c045526-dcb9-4e1a-b602-59b848e20e6a.shp") %>%
  filter(boro_name == "Manhattan") %>%
  st_transform(st_crs(July2018))
Manhatan_line <- st_cast(borough, "MULTILINESTRING")

#import bike lane data
bike15 <- st_read("E:/NYCDOT/nyclion_20d/lion_buffer15/lion_buffer15.shp") %>%
  st_transform(st_crs(July2018)) %>%
  select(Street, FeatureTyp, SegmentTyp, FaceCode, SeqNum, StreetCode, LGC1, SegmentID,
         SHAPE_Leng, SHAPE_Area, geometry)

bike15M <- bike15[borough,]
##===Import data===##
#import July2018, and July2019 data
#July2018 <- st_read("E:/NYCDOT/rideData_cleaned/July2018/July18_cleaned.shp")
#July2019 <- st_read("E:/NYCDOT/rideData_cleaned/July2019/July19_cleaned.shp")


#Due to the size of dataset, only select rhour==22
#July18_22pm <- July2018 %>%
  #filter(rhour == 22)
#July19_22pm <- July2019 %>%
  #filter(rhour == 22)

##==select data only in Manhanttan==##

#find out intersection points
#July18_intersect <- July18_22pm[Manhatan_line,]
#July19_intersect <- July19_22pm[Manhatan_line,]

#only in Manhattan
#July18M_22pm <- July18_22pm[borough,] %>%
  #filter(!id %in% July18_intersect$id)
#July19M_22pm <- July19_22pm[borough,] %>%
  #filter(!id %in% July19_intersect$id)

#check
#mapview(July18M_22pm)
#mapview(July19M_22pm)



##==change trip data's geometry from linestring to points==#
#save the file
#st_write(July18M_22pm, "July18M_22pm.shp", driver = "ESRI Shapefile")
#st_write(July19M_22pm, "July19M_22pm.shp", driver = "ESRI Shapefile")


Visualization
# import results after using ArcGIS (by 150m)
pJuly18M_22pm <- st_read("E:/NYCDOT/riderpoint_shp/July/generated/July18M_22pm_pnts.shp")
pJuly19M_22pm <- st_read("E:/NYCDOT/riderpoint_shp/July/generated/July19M_22pm_pnts.shp")


##==Select points within the roads==##
inrpnts_July18 <- pJuly18M_22pm[bike15M,]
inrpnts_July19 <- pJuly19M_22pm[bike15M,]

#allocate Street Name on roads
inrpnts_July18 <- st_join(inrpnts_July18, bike15M, join = st_intersects, left = TRUE)
inrpnts_July19 <- st_join(inrpnts_July19, bike15M, join = st_intersects, left = TRUE)

##==group points inside same street as linestring==#
#Reference:https://stackoverflow.com/questions/50908771/create-multilines-from-points-grouped-by-id-with-sf-package
pnts2Line_18 <- inrpnts_July18 %>%
  group_by(id, Street) %>%
  summarise(do_union = FALSE) %>%
  st_cast("LINESTRING")

pnts2Line_19 <- inrpnts_July19 %>%
  group_by(id, Street) %>%
  summarise(do_union = FALSE) %>%
  st_cast("LINESTRING")

#Count number of trips on each street
ntrips_July18 <- pnts2Line_18 %>%
  st_drop_geometry() %>%
  group_by(Street) %>%
  summarise(Count = n())

ntrips_July19 <- pnts2Line_19 %>%
  st_drop_geometry() %>%
  group_by(Street) %>%
  summarise(Count = n())


##==bind the trips to road table==##
road_18 <- merge(bike15M %>% select(Street, geometry), ntrips_July18, by = ("Street"), all.x=TRUE)
road_19 <- merge(bike15M %>% select(Street, geometry), ntrips_July19, by = ("Street"), all.x=TRUE)

#add year
road_18$year <- 2018
road_19$year <- 2019
##==visualize==##
ggplot() +
  geom_sf(data = rbind(road_18, road_19), aes(color = Count), alpha = 0.9)+ 
  facet_wrap(~year) +
  scale_colour_viridis(direction = -1,
                       discrete = FALSE, option="viridis",
                       na.value="#D4D4D4") +
  labs(title = "Fig19.Bike Trip Counts in July 22pm") +
  mapTheme()


From the figure, we could see that 2019 have much more data than 2018 overall. Also, bike lanes where have more trips in 2018 also have more trips in 2019.

Trips Comparsion on Added Bike Lanes
##==Find out the number of trips on new added bike lanes==##
#import data of new bike lanes (buffer 15)
bike18new <- st_read("E:/NYCDOT/Feb23/data/new18_clean_buffered/new_18_clean_15ft.shp") %>%
  st_transform(st_crs(July2018))

#assign Street Name to new bike lanes
bike18new_M <- bike18new %>%
  filter(Borough == "Manhattan")

#change column name
colnames(bike18new_M)[1] <- "Street"


#Find the trip points overlap with roads
tripsinNew18 <- pJuly18M_22pm[bike18new_M,]
tripsinNew19 <- pJuly19M_22pm[bike18new_M,]

#allocate road name on pnts
newtrips18 <- st_join(tripsinNew18, bike18new_M %>% select(Street, geometry),
                      join = st_intersects, left = TRUE) %>% na.omit()

newtrips19 <- st_join(tripsinNew19, bike18new_M %>% select(Street, geometry),
                      join = st_intersects, left = TRUE) %>% na.omit()



#group points inside same street as linestring
new.pnts2Line18 <- newtrips18 %>%
  group_by(id, Street) %>%
  summarise(do_union = FALSE) %>%
  st_cast("LINESTRING")

new.pnts2Line19 <- newtrips19 %>%
  group_by(id, Street) %>%
  summarise(do_union = FALSE) %>%
  st_cast("LINESTRING")

#Count
n.new18 <- new.pnts2Line18 %>%
  st_drop_geometry() %>%
  group_by(Street) %>%
  summarise(Count = n()) %>%
  mutate(year = 2018)

n.new19 <- new.pnts2Line19 %>%
  st_drop_geometry() %>%
  group_by(Street) %>%
  summarise(Count = n()) %>%
  mutate(year = 2019)
#Create a summarise table
#Outer join table
new.num <- merge(x = n.new18, y = n.new19, by = "Street", all = TRUE)

#assign NA to 0
new.num[is.na(new.num)] <- 0

#Find out Diff
new.num <- new.num %>%
  mutate(Diff = Count.y - Count.x)

The following table shows the number of added trips on added bike lanes.

new.num %>% 
  select(Street, Diff) %>% 
  kbl(caption='Table4.Added Trips on added bike lanes') %>%
  kable_styling()
Table4.Added Trips on added bike lanes
Street Diff
2 AVENUE 239
7 AVENUE 228
9 AVENUE 79
COLUMBUS AVENUE 24
EAST 110 STREET 17
EAST 111 STREET 7
EAST 126 STREET 3
EAST 128 STREET -3
PEARL STREET 10
WEST 43 STREET 50
WILLIAM STREET 5

A bar graph is also made to visualize how many more trips on the added bike lanes.

ggplot(data=new.num, aes(x=Street, y=Diff)) +
  geom_bar(stat="identity", fill="#99CCFF", color="#66B2FF") +
  theme(
    # Remove grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Change axis line
    axis.title.x=element_blank(),
    axis.ticks.x=element_blank()
  ) +
  labs(title = "Fig20.Trip Differences of Added Bike Lanes",
       subtitle = "Between year 2018 and year 2019 during July 22pm",
       y = "Trip Differences") + plotTheme


Zoom in on Neighbhorhoods

Chelsea & East Harlem


In the part, we select two neighborhoods to see their bike ridership trends in July 2018 and 2019. The first neighborhood is Chelsea, and the second is East Harlem. Chelsea lies on the west side of Manhattan borough. It is a typical representative of Manhattan, which has a high density built environment, large population, and highly diversed art and culture. East Harlem is more residential, we pick this neighborhood because it may be more similar to other residential boroughs in the NYC.
We use trip data in July from 2018 and 2019 for this part, because that weather in July is most suitable for biking.

#added lanes in neighborhood
newLane_chel <- bike18new_M[chelsea,] %>% select(Street,geometry) %>% mutate(isNew = 1)
newLane_har <- bike18new_M[eastHarlem,]%>% select(Street,geometry) %>% group_by(Street) %>% tally() %>% mutate(isNew = 1) %>% select(-n)

First, Let’s see the ridership trend by hour and by day. As shown in the plots below, ridership in both neighborhood increase from 2018 to 2019. The ridership in Chelsea is much higher than that in East Harlem and the daily peaks are more obvious. The frequency of bike use in a month increase in both neighborhoods.

grid.arrange(top="Fig21.Ridership Trend by Hour and by Day",
  ggplot(chel_07_2018 %>% 
           st_drop_geometry() %>%
           mutate(time = ymd_hms(rcrd_tm),
                  interval60 = floor_date(time,unit = "hour")) %>% 
           select(-time) %>% group_by(interval60) %>% tally() %>% 
           mutate(year = 2018))+geom_line(aes(x=interval60,y=n))+
    labs(title = "Ridership pr hour pr day,Chelsea,July,2018")+plotTheme,
  ggplot(chel_07_2019 %>% 
           st_drop_geometry() %>%
           mutate(time = ymd_hms(rcrd_tm),
                  interval60 = floor_date(time,unit = "hour")) %>% 
           select(-time) %>% group_by(interval60) %>% tally() %>% 
           mutate(year = 2019))+geom_line(aes(x=interval60,y=n))+
    labs(title = "Ridership pr hour pr day,Chelsea,July,2019")+plotTheme,
  ggplot(Har_07_2018 %>% 
           st_drop_geometry() %>%
           mutate(time = ymd_hms(rcrd_tm),
                  interval60 = floor_date(time,unit = "hour")) %>% 
           select(-time) %>% group_by(interval60) %>% tally() %>% 
           mutate(year = 2018))+geom_line(aes(x=interval60,y=n))+
    labs(title = "Ridership pr hour pr day,East Harlem,July,2018")+plotTheme,
  ggplot(Har_07_2019 %>% 
           st_drop_geometry() %>%
           mutate(time = ymd_hms(rcrd_tm),
                  interval60 = floor_date(time,unit = "hour")) %>% 
           select(-time) %>% group_by(interval60) %>% tally() %>% 
           mutate(year = 2019))+geom_line(aes(x=interval60,y=n))+
    labs(title = "Ridership pr hour pr day,East Harlem,July,2019")+plotTheme
)


The graph below shows the hourly ridership in a day. This plot shows that in Chelsea, the morning peak hour is around 11 am and the night peak hour is around 10 pm. The daily periodicity in Chelsea become more obvious in 2019, which may indicate that more people use bike to commute to work in their daily life. However, although the ridership in East Harlem also increase, there is no obvious periodicity in a day.

# bike ridership by hour 
ggplot(rbind(
  chel_07_2018 %>%
         st_drop_geometry() %>%
         group_by(rhour) %>% tally() %>%
    mutate(year = "2018",
           neighborhood = "Chelsea",
           gr = paste(year,neighborhood,sep = "_")),
  chel_07_2019 %>%
    st_drop_geometry() %>%
    group_by(rhour) %>% tally() %>%
    mutate(year = "2019",
           neighborhood = "Chelsea",
           gr = paste(year,neighborhood,sep = "_")),
  Har_07_2018 %>%
    st_drop_geometry() %>%
    group_by(rhour) %>% tally() %>%
    mutate(year = "2018",
           neighborhood = "East Harlem",
           gr = paste(year,neighborhood,sep = "_")),
  Har_07_2019 %>%
    st_drop_geometry() %>%
    group_by(rhour) %>% tally() %>%
    mutate(year = "2019",
           neighborhood = "East Harlem",
           gr = paste(year,neighborhood,sep = "_"))))+
    geom_line(aes(x=rhour,y = n, group = gr,color = gr,binwidth = 1))+plotTheme+
  labs(title="Fig22.Ridership pr hour in a day", y="Count", x="Hour", color="Labels")

This plot shows bike ridership by the day of week.The peak bicycle use does not occur on weekends in both neighborhoods.

# bike ridership by day of week
ggplot(rbind(
  chel_07_2018 %>%
    st_drop_geometry() %>%
    mutate(dotw = wday(rcrd_tm,label = T)) %>% 
    group_by(dotw) %>% tally() %>%
    mutate(year = "2018",
           neighborhood = "Chelsea",
           gr = paste(year,neighborhood,sep = "_")),
  chel_07_2019 %>%
    st_drop_geometry() %>%
    mutate(dotw = wday(rcrd_tm,label = T)) %>% 
    group_by(dotw) %>% tally() %>%
    mutate(year = "2019",
           neighborhood = "Chelsea",
           gr = paste(year,neighborhood,sep = "_")),
  Har_07_2018 %>%
    st_drop_geometry() %>%
    mutate(dotw = wday(rcrd_tm,label = T)) %>% 
    group_by(dotw) %>% tally() %>%
    mutate(year = "2018",
           neighborhood = "East Harlem",
           gr = paste(year,neighborhood,sep = "_")),
  Har_07_2019 %>%
    st_drop_geometry() %>%
    mutate(dotw = wday(rcrd_tm,label = T)) %>% 
    group_by(dotw) %>% tally() %>%
    mutate(year = "2019",
           neighborhood = "East Harlem",
           gr = paste(year,neighborhood,sep = "_"))))+
  geom_line(aes(x=dotw,y = n, group = gr,color = gr,binwidth = 1))+plotTheme +
  labs(title="Fig23.Ridership by Weekday", y="Count", x="Day of Week", color="Labels")

Next, we study the ridership changes in two neighborhoods seperately.

This graph compares the ridership in Chelsea in two years. As you can see, the increase of ridership mainly happens in the south west part of the neighborhood. For example, the west 22 ST had a monthly ridership of 1020 in 2019. The riverside roads also see an obvious increase between these two years.

# bike trip in 2018.2019, Chelsea and East Harlem
chel18 <- road15_chel %>% 
  select(Street,trip18,geometry) %>% 
  mutate(Year = 2018) %>% 
  rename(trip=trip18)

chel19 <- road15_chel %>% 
  select(Street,trip19,geometry) %>% 
  mutate(Year = 2019) %>% 
  rename(trip=trip19)

har18 <- road15_har %>% 
  select(Street,trip18,geometry) %>% 
  mutate(Year = 2018) %>% 
  rename(trip=trip18)

har19 <- road15_har %>% 
  select(Street,trip19,geometry) %>% 
  mutate(Year = 2019) %>% 
  rename(trip=trip19)

har_Diff1819 <- road15_har %>% 
  select(Street,trip18,trip19,geometry) %>% 
  mutate(tripDiff = trip19-trip18) %>% 
  select(-trip18,-trip19)

har_Diff1819 <- road15_har %>% 
  select(Street,trip18,trip19,geometry) %>% 
  mutate(tripDiff = trip19-trip18) %>% 
  select(-trip18,-trip19)

har_Diff1819 <- road15_har %>% 
  select(Street,trip18,trip19,geometry) %>% 
  mutate(tripDiff = trip19-trip18,
         Neighborhood = "East Harlem") %>% 
  select(-trip18,-trip19) %>% 
  merge(.,newLane_har %>%
          st_drop_geometry(),by.x="Street",by.y="Street",all.x=T) %>%
  replace(.,is.na(.),0) 

chel_Diff1819 <- road15_chel %>% 
  select(Street,trip18,trip19,geometry) %>% 
  mutate(tripDiff = trip19-trip18,
         Neighborhood = "Chelsea") %>% 
  select(-trip18,-trip19) %>% 
  merge(.,newLane_chel %>%
          st_drop_geometry(),by.x="Street",by.y="Street",all.x=T) %>%
  replace(.,is.na(.),0) 

#Chelsea
ggplot() + 
  geom_sf(data = chelsea,fill = "#f7f5f5",col = NA)+
  geom_sf(data = rbind(chel18,chel19),aes(color = trip))+
  scale_color_viridis(direction = 1,
                      discrete = F, option = "viridis",
                      na.value = "#D4D4D4")+
  labs(title = "Fig24.Trip Count per road, Chelsea, July") + facet_wrap(~Year)+ mapTheme()


Then we take a closer look at the bicycle use in Chelsea. This map visualize the difference in ridership between 2018 and 2019. Although most roads see increase from two years. There are some road where the ridership drops in 2019. For instance, the west 16th st, had a drop of 101 ridership in July. The ridership in west 25th and 36th st also dropped.

We also pay attention to the ridership change on newly added bike lanes. As you can see on the right map, the ridership see great increase on both 7th Ave and 9th Ave, which had new bike lanes in 2018.

# Diff in two years
grid.arrange(
  ggplot()+
  geom_sf(data = chelsea,fill = "#f7f5f5",col = NA)+
  geom_sf(data = chel_Diff1819,aes(color = tripDiff))+
  scale_color_viridis(direction = 1,
                      discrete = F, option = "viridis",
                      na.value = "#D4D4D4")+
  labs(title = "Fig25.Ridership increase,
       Chelsea, 2018-2019") +
  mapTheme(),
  ggplot()+
  geom_sf(data = chelsea,fill = "#f7f5f5",col = NA)+
  geom_sf(data = chel_Diff1819[chel_Diff1819$isNew==1,],aes(color = tripDiff))+
  scale_color_viridis(direction = 1,
                      discrete = F, option = "viridis",
                      na.value = "#D4D4D4")+
  labs(title = "Fig26.Ridership increase on new 
  bikelanes, Chelsea, 2018-2019") +
  mapTheme(),
  ncol=2
)


In East Harlem, the ridership are more concentrated on several roads. The PARK AVENUE has the highest ridership of 311 in July, 2019. Other connecting roads around park avenue also see increase in bike ridership, however, the increase is relatively small compared with that on Park Avenue.

#East Harlem
ggplot() + 
  geom_sf(data = eastHarlem,fill = "#f7f5f5",col = NA)+
  geom_sf(data = rbind(har18,har19),aes(color = trip))+
  scale_color_viridis(direction = 1,
                      discrete = F, option = "viridis",
                      na.value = "#D4D4D4")+
  labs(title = "Fig27.Trip Count per road in July, East Harlem") +
  facet_wrap(~Year)+
  mapTheme()


When we see the ridership difference between two years, we can see the increase in ridership also concentrated on Park AVE. There are also some increase on new added bike lanes.

grid.arrange(
  ggplot()+
  geom_sf(data = eastHarlem,fill = "#f7f5f5",col = NA)+
  geom_sf(data = har_Diff1819,aes(color = tripDiff))+
  scale_color_viridis(direction = 1,
                      discrete = F, option = "viridis",
                      na.value = "#D4D4D4")+
     labs(title = "Fig28.Ridership increase, East Harlem, 
       2018-2019") +
  mapTheme(),
  ggplot()+
  geom_sf(data = eastHarlem,fill = "#f7f5f5",col = NA)+
  geom_sf(data = har_Diff1819[har_Diff1819$isNew==1,],aes(color = tripDiff))+
  scale_color_viridis(direction = 1,
                      discrete = F, option = "viridis",
                      na.value = "#D4D4D4")+
  labs(title = "Fig29.Ridership increase on new 
  bikelanes, East Harlem,
  2018-2019") +
  mapTheme(),
  ncol=2
)